library('dplyr') # data manipulation
library('reshape2') # data manipulation
library('ggplot2') # visualization
library('knitr')
library('plotly') # visualization
library('corrplot') # package corrplot
library('caret') # data mining
library('zoo') # NA values approximation
ds <- read.csv(url("http://www.cs.put.poznan.pl/dbrzezinski/teaching/zed/sledzie.csv"),na.strings = c("NA","NaN", " ","?") )
ds <- tbl_df(ds)
X may be considered as main Id and indicates chronological order. The lower the value the older is the sample.
| Variable Name | Description | Units |
|---|---|---|
| length | length of herring | [cm] |
| cfin1 | accessibility of plankton | [concentration of Calanus finmarchicus spec. 1]; |
| cfin2 | accessibility of plankton | [concentration of Calanus finmarchicus spec. 2]; |
| chel1 | accessibility of plankton | [concentration of Calanus helgolandicus spec. 1]; |
| chel2 | accessibility of plankton | [concentration of Calanus helgolandicus spec. 2]; |
| lcop1 | accessibility of plankton | [concentration of Copepoda spec. 1]; |
| lcop2 | accessibility of plankton | [concentration of Copepoda spec. 2]; |
| fbar | intensity of fishing in region | [fraction of preserved fry]; |
| recr | annual fry | [number of herrings]; |
| cumf | total annual intensity of fishing in region | [fraction of preserved fry]; |
| totaln | total number of fishes catched during a single fishing | [number of herrings]; |
| sst | temperature of the water at the surface | [°C]; |
| sal | level of salinity | [Knudsen ppt]; |
| xmonth | month of fishing | [number of month]; |
| nao | North Atlantic oscillation (difference of atmospheric pressure at sea level between the Icelandic low and the Azores high) | [mb] |
Empty values are not significant.
knitr::kable(summary(ds))
| X | length | cfin1 | cfin2 | chel1 | chel2 | lcop1 | lcop2 | fbar | recr | cumf | totaln | sst | sal | xmonth | nao | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 0 | Min. :19.0 | Min. : 0.0000 | Min. : 0.0000 | Min. : 0.000 | Min. : 5.238 | Min. : 0.3074 | Min. : 7.849 | Min. :0.0680 | Min. : 140515 | Min. :0.06833 | Min. : 144137 | Min. :12.77 | Min. :35.40 | Min. : 1.000 | Min. :-4.89000 | |
| 1st Qu.:13145 | 1st Qu.:24.0 | 1st Qu.: 0.0000 | 1st Qu.: 0.2778 | 1st Qu.: 2.469 | 1st Qu.:13.427 | 1st Qu.: 2.5479 | 1st Qu.:17.808 | 1st Qu.:0.2270 | 1st Qu.: 360061 | 1st Qu.:0.14809 | 1st Qu.: 306068 | 1st Qu.:13.60 | 1st Qu.:35.51 | 1st Qu.: 5.000 | 1st Qu.:-1.89000 | |
| Median :26291 | Median :25.5 | Median : 0.1111 | Median : 0.7012 | Median : 5.750 | Median :21.673 | Median : 7.0000 | Median :24.859 | Median :0.3320 | Median : 421391 | Median :0.23191 | Median : 539558 | Median :13.86 | Median :35.51 | Median : 8.000 | Median : 0.20000 | |
| Mean :26291 | Mean :25.3 | Mean : 0.4458 | Mean : 2.0248 | Mean :10.006 | Mean :21.221 | Mean : 12.8108 | Mean :28.419 | Mean :0.3304 | Mean : 520367 | Mean :0.22981 | Mean : 514973 | Mean :13.87 | Mean :35.51 | Mean : 7.258 | Mean :-0.09236 | |
| 3rd Qu.:39436 | 3rd Qu.:26.5 | 3rd Qu.: 0.3333 | 3rd Qu.: 1.7936 | 3rd Qu.:11.500 | 3rd Qu.:27.193 | 3rd Qu.: 21.2315 | 3rd Qu.:37.232 | 3rd Qu.:0.4560 | 3rd Qu.: 724151 | 3rd Qu.:0.29803 | 3rd Qu.: 730351 | 3rd Qu.:14.16 | 3rd Qu.:35.52 | 3rd Qu.: 9.000 | 3rd Qu.: 1.63000 | |
| Max. :52581 | Max. :32.5 | Max. :37.6667 | Max. :19.3958 | Max. :75.000 | Max. :57.706 | Max. :115.5833 | Max. :68.736 | Max. :0.8490 | Max. :1565890 | Max. :0.39801 | Max. :1015595 | Max. :14.73 | Max. :35.61 | Max. :12.000 | Max. : 5.08000 | |
| NA | NA | NA’s :1581 | NA’s :1536 | NA’s :1555 | NA’s :1556 | NA’s :1653 | NA’s :1591 | NA | NA | NA | NA | NA’s :1584 | NA | NA | NA |
d <- melt(ds[,-c(1)])
## No id variables; using all as measure variables
ggplot(d,aes(x = value)) +
facet_wrap(~variable,scales = "free_x") +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- ggplot(ds,aes(X,length)) + geom_point(alpha=0.04) + geom_smooth(method="auto", se=TRUE, color="red")
plt1 <- htmltools::tagList()
plt1 <- as.widget(ggplotly(p1))
## `geom_smooth()` using method = 'gam'
plt1
p2 <- ggplot(ds,aes(X,length, group = xmonth)) + geom_point(stat = "identity",alpha=0.03 ) + geom_smooth(method="auto", se=TRUE,color="red") + facet_grid(~xmonth, scales = "fixed") + facet_wrap(~ xmonth, ncol = 3,labeller = month_labeller) + theme_minimal()
plt2 <- htmltools::tagList()
plt2 <- as.widget(ggplotly(p2))
## `geom_smooth()` using method = 'gam'
plt2
Data set is split into training and test data set.
set.seed(1993)
ds2 <- ds %>% select(-c(X))
inTraining <-
createDataPartition(
y = ds2$length,
p = .75,
list = FALSE)
training <- ds2[ inTraining,]
testing <- ds2[-inTraining,]
Just by looking through the data low standard deviation per month for every attribute can be noticed. Many values seems to be duplicated or changed insignificantly. Simple approximation is used to determine missing values. Every row where value of NA was not determined by this method is deleted. Testing set is reduced to complete cases only.
set.seed(1993)
for(i in 1:ncol(training)){
if(anyNA(ds2[,i])){
training[,i] <- na.approx(training[,i],na.rm = F,maxgap = 5)
}
}
training <- training[complete.cases(training),]
testing <- testing[complete.cases(testing),]
knitr::kable(summary(training))
| length | cfin1 | cfin2 | chel1 | chel2 | lcop1 | lcop2 | fbar | recr | cumf | totaln | sst | sal | xmonth | nao | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :19.00 | Min. : 0.0000 | Min. : 0.0000 | Min. : 0.000 | Min. : 5.238 | Min. : 0.3074 | Min. : 7.849 | Min. :0.0680 | Min. : 140515 | Min. :0.06833 | Min. : 144137 | Min. :12.77 | Min. :35.40 | Min. : 1.000 | Min. :-4.89000 | |
| 1st Qu.:24.00 | 1st Qu.: 0.0000 | 1st Qu.: 0.2778 | 1st Qu.: 2.469 | 1st Qu.:13.427 | 1st Qu.: 2.5479 | 1st Qu.:17.808 | 1st Qu.:0.2270 | 1st Qu.: 360061 | 1st Qu.:0.14809 | 1st Qu.: 306068 | 1st Qu.:13.60 | 1st Qu.:35.51 | 1st Qu.: 5.000 | 1st Qu.:-1.89000 | |
| Median :25.50 | Median : 0.1111 | Median : 0.7012 | Median : 5.750 | Median :21.435 | Median : 7.0000 | Median :24.859 | Median :0.3320 | Median : 421391 | Median :0.23191 | Median : 539558 | Median :13.86 | Median :35.51 | Median : 8.000 | Median : 0.20000 | |
| Mean :25.31 | Mean : 0.4419 | Mean : 2.0100 | Mean :10.015 | Mean :21.198 | Mean : 12.8117 | Mean :28.386 | Mean :0.3306 | Mean : 519930 | Mean :0.22965 | Mean : 515111 | Mean :13.87 | Mean :35.51 | Mean : 7.257 | Mean :-0.09418 | |
| 3rd Qu.:26.50 | 3rd Qu.: 0.3333 | 3rd Qu.: 1.7936 | 3rd Qu.:11.500 | 3rd Qu.:27.193 | 3rd Qu.: 21.2315 | 3rd Qu.:37.232 | 3rd Qu.:0.4650 | 3rd Qu.: 724151 | 3rd Qu.:0.29803 | 3rd Qu.: 730351 | 3rd Qu.:14.21 | 3rd Qu.:35.52 | 3rd Qu.: 9.000 | 3rd Qu.: 1.63000 | |
| Max. :32.50 | Max. :37.6667 | Max. :19.3958 | Max. :75.000 | Max. :57.706 | Max. :115.5833 | Max. :68.736 | Max. :0.8490 | Max. :1565890 | Max. :0.39801 | Max. :1015595 | Max. :14.73 | Max. :35.61 | Max. :12.000 | Max. : 5.08000 |
knitr::kable(summary(testing))
| length | cfin1 | cfin2 | chel1 | chel2 | lcop1 | lcop2 | fbar | recr | cumf | totaln | sst | sal | xmonth | nao | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :19.50 | Min. : 0.0000 | Min. : 0.0000 | Min. : 0.000 | Min. : 5.238 | Min. : 0.3074 | Min. : 7.849 | Min. :0.0680 | Min. : 140515 | Min. :0.06833 | Min. : 144137 | Min. :12.77 | Min. :35.40 | Min. : 1.000 | Min. :-4.8900 | |
| 1st Qu.:24.00 | 1st Qu.: 0.0000 | 1st Qu.: 0.2671 | 1st Qu.: 2.469 | 1st Qu.:13.427 | 1st Qu.: 2.5479 | 1st Qu.:17.808 | 1st Qu.:0.2270 | 1st Qu.: 360061 | 1st Qu.:0.14809 | 1st Qu.: 306068 | 1st Qu.:13.63 | 1st Qu.:35.51 | 1st Qu.: 6.000 | 1st Qu.:-1.8900 | |
| Median :25.50 | Median : 0.1111 | Median : 0.7012 | Median : 5.750 | Median :21.673 | Median : 7.0717 | Median :24.859 | Median :0.3320 | Median : 421391 | Median :0.23191 | Median : 539558 | Median :13.86 | Median :35.51 | Median : 8.000 | Median : 0.2000 | |
| Mean :25.31 | Mean : 0.4526 | Mean : 2.0474 | Mean : 9.948 | Mean :21.251 | Mean : 12.7925 | Mean :28.450 | Mean :0.3297 | Mean : 521438 | Mean :0.23005 | Mean : 514946 | Mean :13.88 | Mean :35.51 | Mean : 7.252 | Mean :-0.0918 | |
| 3rd Qu.:26.50 | 3rd Qu.: 0.3333 | 3rd Qu.: 1.7936 | 3rd Qu.:11.500 | 3rd Qu.:27.193 | 3rd Qu.: 21.2315 | 3rd Qu.:37.232 | 3rd Qu.:0.4560 | 3rd Qu.: 724151 | 3rd Qu.:0.29803 | 3rd Qu.: 730351 | 3rd Qu.:14.16 | 3rd Qu.:35.52 | 3rd Qu.: 9.000 | 3rd Qu.: 1.6300 | |
| Max. :32.00 | Max. :37.6667 | Max. :19.3958 | Max. :75.000 | Max. :57.706 | Max. :115.5833 | Max. :68.736 | Max. :0.8490 | Max. :1565890 | Max. :0.39801 | Max. :1015595 | Max. :14.73 | Max. :35.61 | Max. :12.000 | Max. : 5.0800 |
Length attribute seems to be negatively correlated to time and North Atlantic oscillation but is positively correlated to intensity of fishing in region. Moreover length is negatively correlated to temperature of the water at the surface. Other correlations confirms validity of data. For example total number of fishes catched during a single fishing is correlated to total annual intensity of fishing in region.
knitr::kable(cor(training))
| X | length | cfin1 | cfin2 | chel1 | chel2 | lcop1 | lcop2 | fbar | recr | cumf | totaln | sst | sal | xmonth | nao | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X | 1.0000000 | -0.3384877 | -0.1497344 | 0.0625020 | -0.1655801 | 0.0571219 | -0.2290329 | 0.0423353 | 0.0965635 | 0.0006595 | 0.2353515 | -0.3626289 | 0.3613572 | -0.0520178 | -0.0031134 | 0.4127596 |
| length | -0.3384877 | 1.0000000 | 0.0805045 | 0.0939675 | 0.2196150 | -0.0127203 | 0.2362724 | 0.0483279 | 0.2534052 | -0.0115559 | 0.0080335 | 0.0922081 | -0.4529849 | 0.0288068 | 0.0142225 | -0.2612139 |
| cfin1 | -0.1497344 | 0.0805045 | 1.0000000 | 0.1543359 | 0.0872516 | 0.1998542 | 0.1113421 | 0.2107115 | -0.0665129 | 0.1159156 | -0.0494736 | 0.1298700 | 0.0114949 | 0.1308075 | 0.0057650 | 0.0084740 |
| cfin2 | 0.0625020 | 0.0939675 | 0.1543359 | 1.0000000 | -0.0047987 | 0.3046562 | -0.0408257 | 0.6495371 | 0.1493054 | -0.0939349 | 0.3315717 | -0.2125897 | -0.2313141 | -0.0810768 | 0.0124063 | -0.0005043 |
| chel1 | -0.1655801 | 0.2196150 | 0.0872516 | -0.0047987 | 1.0000000 | 0.2854820 | 0.9556356 | 0.2480208 | 0.1630038 | -0.0499888 | 0.0695477 | 0.1659248 | -0.2149789 | -0.1486619 | 0.0452955 | -0.5066948 |
| chel2 | 0.0571219 | -0.0127203 | 0.1998542 | 0.3046562 | 0.2854820 | 1.0000000 | 0.1752773 | 0.8845860 | 0.0280526 | 0.0009182 | 0.2614306 | -0.3766883 | 0.0126750 | -0.2206494 | 0.0682363 | -0.0546246 |
| lcop1 | -0.2290329 | 0.2362724 | 0.1113421 | -0.0408257 | 0.9556356 | 0.1752773 | 1.0000000 | 0.1511895 | 0.0989205 | 0.0001623 | -0.0094285 | 0.2648605 | -0.2638250 | -0.1020682 | 0.0319319 | -0.5520941 |
| lcop2 | 0.0423353 | 0.0483279 | 0.2107115 | 0.6495371 | 0.2480208 | 0.8845860 | 0.1511895 | 1.0000000 | 0.0510242 | 0.0020792 | 0.2874762 | -0.3007467 | -0.1143536 | -0.1826004 | 0.0587263 | -0.0394735 |
| fbar | 0.0965635 | 0.2534052 | -0.0665129 | 0.1493054 | 0.1630038 | 0.0280526 | 0.0989205 | 0.0510242 | 1.0000000 | -0.2382844 | 0.8181077 | -0.5087197 | -0.1747903 | 0.0422798 | 0.0059388 | 0.0623021 |
| recr | 0.0006595 | -0.0115559 | 0.1159156 | -0.0939349 | -0.0499888 | 0.0009182 | 0.0001623 | 0.0020792 | -0.2382844 | 1.0000000 | -0.2595413 | 0.3705363 | -0.2014227 | 0.2745302 | 0.0218885 | 0.0917721 |
| cumf | 0.2353515 | 0.0080335 | -0.0494736 | 0.3315717 | 0.0695477 | 0.2614306 | -0.0094285 | 0.2874762 | 0.8181077 | -0.2595413 | 1.0000000 | -0.7077117 | 0.0345575 | -0.0993258 | 0.0341512 | 0.2248259 |
| totaln | -0.3626289 | 0.0922081 | 0.1298700 | -0.2125897 | 0.1659248 | -0.3766883 | 0.2648605 | -0.3007467 | -0.5087197 | 0.3705363 | -0.7077117 | 1.0000000 | -0.2887172 | 0.1465778 | -0.0274147 | -0.3908094 |
| sst | 0.3613572 | -0.4529849 | 0.0114949 | -0.2313141 | -0.2149789 | 0.0126750 | -0.2638250 | -0.1143536 | -0.1747903 | -0.2014227 | 0.0345575 | -0.2887172 | 1.0000000 | 0.0124690 | -0.0095941 | 0.5106722 |
| sal | -0.0520178 | 0.0288068 | 0.1308075 | -0.0810768 | -0.1486619 | -0.2206494 | -0.1020682 | -0.1826004 | 0.0422798 | 0.2745302 | -0.0993258 | 0.1465778 | 0.0124690 | 1.0000000 | -0.0230910 | 0.1287458 |
| xmonth | -0.0031134 | 0.0142225 | 0.0057650 | 0.0124063 | 0.0452955 | 0.0682363 | 0.0319319 | 0.0587263 | 0.0059388 | 0.0218885 | 0.0341512 | -0.0274147 | -0.0095941 | -0.0230910 | 1.0000000 | -0.0035912 |
| nao | 0.4127596 | -0.2612139 | 0.0084740 | -0.0005043 | -0.5066948 | -0.0546246 | -0.5520941 | -0.0394735 | 0.0623021 | 0.0917721 | 0.2248259 | -0.3908094 | 0.5106722 | 0.1287458 | -0.0035912 | 1.0000000 |
corrplot(cor(training))
Building regression model is checked with two measures RMSE and R^2. Best regression model was built using random forest algorithm.
set.seed(1993)
ctrl <- trainControl(
method = "repeatedcv",
number = 2,
repeats = 5)
set.seed(1993)
rf_fit <- train(length ~ .,
data = training,
method = "rf",
trControl = ctrl,
importance = TRUE,
ntree = 5)
rf_pred <- predict(rf_fit, testing)
rf_fit
## Random Forest
##
## 39437 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times)
## Summary of sample sizes: 19719, 19718, 19719, 19718, 19719, 19718, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 1.139878 0.5256267
## 8 1.117765 0.5471589
## 15 1.211110 0.4952169
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 8.
postResample(pred = rf_pred, obs = testing$length)
## RMSE Rsquared
## 1.1004934 0.5571783
rf_fit2 <- train(length ~ .,
data = training,
method = "rf",
trControl = ctrl,
importance = TRUE,
ntree = 15)
rf_pred2 <- predict(rf_fit2, testing)
rf_fit2
## Random Forest
##
## 39437 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times)
## Summary of sample sizes: 19718, 19719, 19718, 19719, 19719, 19718, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 1.134458 0.5302668
## 8 1.098260 0.5610723
## 15 1.185089 0.5093636
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 8.
postResample(pred = rf_pred2, obs = testing$length)
## RMSE Rsquared
## 1.0897997 0.5647872
Results for ntree = 5
imp1 <- varImp(rf_fit$finalModel, scale=FALSE)
knitr::kable(imp1)
| Overall | |
|---|---|
| X | 1.7082357 |
| cfin1 | 0.4006010 |
| cfin2 | 0.4523616 |
| chel1 | 0.2752032 |
| chel2 | 0.3707563 |
| lcop1 | 0.5228276 |
| lcop2 | 0.4337889 |
| fbar | 0.5993601 |
| recr | 0.5960982 |
| cumf | 0.4100346 |
| totaln | 0.6370218 |
| sst | 2.0892154 |
| sal | 0.0795844 |
| xmonth | 0.4957700 |
| nao | 0.1783721 |
Results for ntree = 15
imp2 <- varImp(rf_fit2$finalModel, scale=FALSE)
knitr::kable(imp2)
| Overall | |
|---|---|
| X | 1.4489873 |
| cfin1 | 0.4189872 |
| cfin2 | 0.4587345 |
| chel1 | 0.3251824 |
| chel2 | 0.5871283 |
| lcop1 | 0.5511939 |
| lcop2 | 0.4827812 |
| fbar | 0.6947083 |
| recr | 0.5904054 |
| cumf | 0.2609754 |
| totaln | 0.4123031 |
| sst | 1.9644069 |
| sal | 0.2135060 |
| xmonth | 0.4683507 |
| nao | 0.1255477 |
The attributes with impact for length of herrings are temperature of the water at the surface and north Atlantic oscillation.